### this code cannot be run without the raw 2015 Local Parliament Project dataset, which is not being shared as part of JOP replication archive. we are sharing this code and the survey codebook so one can see how it was created. 

library(tidyverse)

## note that this data is not shared in this replication archive.  
d <- readstata13::read.dta13("LPP_all_weights_included.dta") %>% 
  filter(!is.na(constituencynumber))

write(paste0("To start we have ", nrow(d), " observations in the LPP.\n"), file = "missingness_log.txt", append = T)


### we will make a dataset of demographic categories and preference ratings (with constituency id and vote choice?), which will then be multiply imputed 


#### defining variables for analysis and multiple imputation #### 

d$id = 1:nrow(d)

id.cnames = c("id", "constituencynumber", "party_id")
d$did.not.vote.or.dont.know.2011 = as.integer(d$vote2011 %in% 7:9)
d$region = as.character(d$province)
d$region[as.character(d$province) %in% c("New Brunswick", "Newfoundland and Labrador", "Nova Scotia", "Prince Edward Island")] = "Atlantic"
d$region[as.character(d$province) %in% c("Alberta", "Manitoba", "Saskatchewan")] = "Prairies"
d$region = as.factor(d$region)
d$education[d$education == 9] = NA

### demographics 

d$age_cat = NA
d$age_cat[d$yob <= 36] = 3 # "60+"
d$age_cat[d$yob > 36 & d$yob <= 56] = 2 # "40-60
d$age_cat[d$yob > 56] = 1 # below 40

# income 
d$income_cat = NA
d$income_cat[d$income < 4] = 1 # under 60k
d$income_cat[d$income %in% c(4,5)] = 2 # 60k-100k
d$income_cat[d$income %in% c(6,7,9,10)] = 3 # 100k and up 

# education 
d$edu_cat = NA
d$edu_cat[d$education %in% c(1,2)] = 1 # high school max
d$edu_cat[d$education %in% c(3,4)] = 2 # some college
d$edu_cat[d$education %in% c(5,6,7,8)] = 3 # college plus 


## other characteristics we subset by as controls (Z)
# attention_1 is fine as is. 

# did the person predict local winner correctly? 
# a bit tricky because different question for Quebec and others 
# also we are missing cons_pred2_1, i.e. how likely that the Liberal won.
# write code anyway, swap in correct data when we have it. 
# as first cut, a binary measure -- did the respondent's top choices include the actual winner? 

results <- read_csv("table_tableau12_2015.csv") 

results %>% 
  select(constituencynumber = `Electoral District Number/Numéro de circonscription`,
         cand = `Candidate/Candidat`,
         votes = `Votes Obtained/Votes obtenus`) %>% 
  mutate(party = case_when(
    str_detect(cand, "Liberal") ~ "lib",
    str_detect(cand, "Conservative") ~ "con",
    str_detect(cand, "NDP") ~ "ndp",
    str_detect(cand, "Green Party") ~ "grn",
    str_detect(cand, "Bloc Qu") ~ "bq",
    str_detect(cand, "People's Party") ~ "pp",
    str_detect(cand, "Independent") ~ "ind",
    TRUE ~ "other"
  )) %>% 
  group_by(constituencynumber) %>%
  filter(votes == max(votes)) %>% 
  ungroup() %>% 
  select(constituencynumber, winner_party = party) -> winners

d %>% 
  select(id, constituencynumber, province, starts_with("cons_pred")) %>% 
  pivot_longer(starts_with("cons_pred"), names_prefix = "cons_pred") %>% 
  filter(province == "Quebec" & str_detect(name, "1_") | province != "Quebec" & str_detect(name, "2_")) %>% 
  mutate(party = recode(name, "2_1" = "lib", "2_2.1" = "con", "2_2" = "con", "2_3" = "ndp", "2_4" = "grn", "1_1" = "lib", "1_2" = "con", "1_3" = "ndp", "1_4" = "bq", "1_5" = "grn")) %>% 
  select(-name, -province) %>%
  mutate(value = ifelse(value == -99, NA, value)) %>% 
  group_by(id, constituencynumber) %>% 
  filter(value == max(value, na.rm = T)) %>% 
  ungroup() %>% 
  left_join(winners, by = "constituencynumber") %>% 
  mutate(correct = party == winner_party) %>% 
  group_by(id, constituencynumber) %>% 
  summarize(how_many_correct = sum(correct)) %>% 
  ungroup() %>% 
  mutate(correctly_predicted_result = (how_many_correct > 0) %>% as.integer()) %>% 
  select(-how_many_correct) -> cpr_df

d.o = d
d <- merge(d, cpr_df, by = c("constituencynumber", "id"))


demo.cnames = c("region", "did.not.vote.or.dont.know.2011", # these are ones we don't plan to subset by, but they might help with imputation? 
                "yob", "income", "education", "gender", "age_cat", "income_cat", "edu_cat", # and then we'll create a left-right one based on imputed preferences
                "correctly_predicted_result", "attention_1")
# exclude 32 people with gender=="other"
d <- d[!d$gender == 3, ]


# preference data 
parties = c("lib", "con", "ndp", "bq", "grn")
leader.parties = c("con", "lib", "ndp", "bq", "grn")
cand.parties = c("con", "lib", "grn", "bq", "ndp")
d$cand_nolab_5 = d$cand_nolab_6; d$cand_plab_5 = d$cand_plab_6
d$party_id = NA
for(j in 1:length(parties)){
  d$party_id[d$id_party == j] = parties[j]
  d[[paste0("cps_party_therm_", parties[j])]] = d[[paste0("parties_", j)]]
  d[[paste0("pes_party_therm_", parties[j])]] = d[[paste0("pes_partyf_", j)]] # need to get this
  d[[paste0("cps_leader_therm_", leader.parties[j])]] = d[[paste0("leaders_", j)]]
  d[[paste0("pes_leader_therm_", leader.parties[j])]] = d[[paste0("pes_leadf_", j)]] # need to get this
  d[[paste0("cps_cand_therm_", cand.parties[j])]] = d[[paste0("cand_nolab_", j)]]
  no.no.lab = is.na(d[[paste0("cps_cand_therm_", cand.parties[j])]])
  d[[paste0("cps_cand_therm_", cand.parties[j])]][no.no.lab] = d[[paste0("cand_plab_", j)]][no.no.lab]
  
}
party.pref.cps.cnames = paste0("cps_party_therm_", parties)
leader.pref.cps.cnames = paste0("cps_leader_therm_", parties)
party.pref.pes.cnames = paste0("pes_party_therm_", parties)
leader.pref.pes.cnames = paste0("pes_leader_therm_", parties)
cand.pref.cps.cnames = paste0("cps_cand_therm_", parties)



#### Assess missingness #### 
is_missing <- function(vec){
  is.na(vec) | vec == -99
}

missing.mat = matrix(NA, nrow = 3, ncol = length(parties))
colnames(missing.mat) = parties
rownames(missing.mat) = c("Missing party rating", "Missing party or leader rating", "Missing party, leader, or candidate rating")
for(j in 1:5){
  missing.party = is_missing(d[[paste0("cps_party_therm_", parties[j])]])
  missing.leader = is_missing(d[[paste0("cps_leader_therm_", parties[j])]])
  missing.cand = is_missing(d[[paste0("cps_cand_therm_", parties[j])]])
  missing.mat[,j] = c(mean(missing.party, na.rm = T), mean(missing.leader | missing.party, na.rm = T), mean(missing.cand | missing.leader | missing.party, na.rm = T)) 
}



#### VOTE CHOICE #### 
# not going to multiply impute I guess, but let's keep it around. 

# pre-election survey: lots of undecided 
d$vote_pre = d$v_noleader
d$vote_pre[is_missing(d$v_noleader)] = d$v_leader[is_missing(d$v_noleader)]
d$shown_leader_party = !is_missing(d$v_leader)
d$vote_party_pre = NA
responses_pre = c("lib", "con", "ndp", "bq", "grn", "und", "dk")
for(j in 1:7){
  d$vote_party_pre[d$vote_pre == j] = responses_pre[j]	
}
# vote choice is only asked of likely voters. 

# post-election survey -- lots of missing
d$vote_party_post = NA
responses = c("lib", "con", "ndp", "bq", "grn", "oth")
for(j in 1:7){
  d$vote_party_post[d$pes_vote == j] = responses[j]	
}

d$vote_party_both = d$vote_party_post
d$vote_party_both[is.na(d$vote_party_post)] = d$vote_party_pre[is.na(d$vote_party_post)]
d$vote_party_both[d$vote_party_both %in% c("und", "oth")] = NA

vote.cnames = c("vote_party_pre", "vote_party_post", "vote_party_both")


D.for.mi = d[,c(id.cnames, vote.cnames, demo.cnames, party.pref.cps.cnames, leader.pref.cps.cnames, party.pref.pes.cnames, leader.pref.pes.cnames, cand.pref.cps.cnames)]

for(vname in c(party.pref.cps.cnames, leader.pref.cps.cnames, party.pref.pes.cnames, leader.pref.pes.cnames, cand.pref.cps.cnames)){
  D.for.mi[[vname]] = na_if(D.for.mi[[vname]], -99)
}

## why do MI for cases in which we have almost no preference data? 
number.not.missing = function(vec){sum(!is.na(vec))}
nnm = apply(D.for.mi[,c(party.pref.cps.cnames, leader.pref.cps.cnames, cand.pref.cps.cnames)], 1, number.not.missing)  # exclusion still based on pre-election survey
too.few.preferences = nnm < 5
# prop.table(table(too.few.preferences, d$region == "Quebec"), 2)
# fewer excluded from Quebec. 
# should we have different standards for Quebec and others, because 
# Quebec was asked about 5 candidates vs 4 elsewhere? 
too.few.preferences.2 = (d$region == "Quebec" & nnm < 6) | (d$region != "Quebec" & nnm < 5)
# prop.table(table(too.few.preferences.2, d$region == "Quebec"), 2)
# narrows gap somewhat, but not all the way. so a single rule is simpler.
# did not otherwise treat Quebec differently. 

write(paste0(sum(too.few.preferences), " have fewer than 5 preferences.\n"), file = "missingness_log.txt", append = T)

voted.at.some.point = !is.na(D.for.mi$vote_party_both)

write(paste0(sum(!voted.at.some.point & !too.few.preferences), " of those who remain don't express a vote.\n"), file = "missingness_log.txt", append = T)

use = voted.at.some.point & !too.few.preferences

DD = D.for.mi[use, ]

write(paste0(nrow(DD), " rows in the data going into MI.\n"), file = "missingness_log.txt", append = T)

save(DD, file = "2015_LPP_preprocessed.RData")

